Packages To Be Used

library(anomalize)
library(dplyr)
library(FinTS)
library(forecast)
library(fpp2)
library(fUnitRoots) 
library(ggplot2)
library(gridExtra)
library(hrbrthemes)
library(lmtest)
library(lubridate)
library(pdR)
library(plotly)
library(RColorBrewer)
library(tibbletime)
library(tidyverse)
library(timetk)
library(TSA)
library(tseries)
library(xts)
# Load monthly THYAO opening prices exported from tr.investing.com.
# NOTE(review): setwd() is machine-specific; prefer project-relative paths
# or here::here() in reusable code.
setwd("C:/Users/Cesur/Desktop")
data <- read.csv("Project.csv")
# Keep only the date (Tarih) and opening price (Açılış) columns.
data <- data %>% select(Tarih,Açılış)
# Dates arrive as "dd.mm.yyyy" strings; parse them to Date.
data$Tarih <- as.Date(data$Tarih,format="%d.%m.%Y")
# The CSV is newest-first; sort ascending by date.
data <- data[order(data$Tarih,decreasing = FALSE),]
# Prices use a comma as the decimal separator; convert to numeric.
data[,2] <- as.numeric(gsub(",", ".", data[,2]))
dim(data)
## [1] 109   2

The original data consists of 7 columns and 109 rows; after selecting, we keep two columns. The first column is the date (Tarih), and the second (Açılış) is a continuous variable.

# Inspect the data: first and last rows, missing values, and summary statistics.
head(data)
##          Tarih Açılış
## 109 2014-12-01   9.41
## 108 2015-01-01   9.60
## 107 2015-02-01   9.28
## 106 2015-03-01   9.08
## 105 2015-04-01   8.62
## 104 2015-05-01   8.85
tail(data)
##        Tarih Açılış
## 6 2023-07-01  203.5
## 5 2023-08-01  232.4
## 4 2023-09-01  245.1
## 3 2023-10-01  242.5
## 2 2023-11-01  220.3
## 1 2023-12-01  255.5
# Total NA count over the whole data frame; 0 means the data is complete.
sum(is.na(data))
## [1] 0
summary(data)
##      Tarih                Açılış      
##  Min.   :2014-12-01   Min.   :  4.82  
##  1st Qu.:2017-03-01   1st Qu.:  8.94  
##  Median :2019-06-01   Median : 12.75  
##  Mean   :2019-06-01   Mean   : 35.12  
##  3rd Qu.:2021-09-01   3rd Qu.: 17.45  
##  Max.   :2023-12-01   Max.   :255.50

I will concentrate on the Açılış column in my project.

The source of data is https://tr.investing.com/equities/turk-hava-yollari-historical-data.

The Açılış variable ranges from 4.82 ₺ to 255.50 ₺, and its mean is 35.12 ₺.

# Interactive area/line chart of the monthly THYAO opening price.
price_plot <- ggplot(data, aes(x = Tarih, y = Açılış)) +
  geom_area(fill = "blue", alpha = 0.5) +
  geom_line(color = "black") +
  ylab("THYAO Price") +
  labs(title = "Turkish Airlines Price (THYAO) from 2014-12 to 2023-12") +
  theme_ipsum()
# Convert the static ggplot into an interactive plotly widget.
price_plot <- ggplotly(price_plot)
price_plot

It is seen that there is an increasing trend at the graph. There is no seasonality in the data.

Spliting Data

# Monthly time series: Dec 2014 through Dec 2023 (109 observations).
datap <- ts(data[,2],start = c(2014,12),end=c(2023,12),frequency = 12)
# Train on the first 97 observations (Dec 2014 - Dec 2022)...
train <- ts(datap[1:97], start = c(2014, 12), frequency = 12)
# ...and hold out the last 12 observations (Jan - Dec 2023) as the test set.
test <- ts(datap[98:109],start=c(2023,1),end=c(2023,12),frequency = 12)

We split our data into train and test sets. The test set contains the last 12 observations of the data.

Anomalize Detection

# Anomaly detection on the training window using the anomalize workflow.
# The decomposition/plot pipeline is kept commented out in the report.
df <- data[1:97,]
df <- as_tibble(df)
# df %>% 
  #time_decompose(Açılış, method = "stl", frequency = "auto", trend = "auto") %>%
  #anomalize(remainder,method = "gesd", alpha = 0.05, max_anoms = 0.2) %>%
  #plot_anomaly_decomposition()

Box-Cox Transformation

# Estimate and apply the Box-Cox transformation on the training series.
BoxCox.ar(train,method = c("ols"))

lambda1 <- BoxCox.lambda(train)
lambda1
## [1] -0.4406391
# Transform with the estimated lambda (~ -0.44); needed later for the inverse.
train_box <- BoxCox(train,lambda1)
# Fix: the title string was passed as autoplot()'s second positional argument
# (the `series` slot), so it never rendered as a plot title; set it explicitly
# with ggtitle(). Also fixes the "THYAYO" typo.
autoplot(train_box, color = "darkblue") +
  ggtitle("Transformed Time Series Plot Of THYAO Price") +
  theme_minimal()

We applied a Box-Cox transformation because the estimated lambda (-0.44) is far from 1, indicating that a transformation is needed. The series is still not stationary, because there is an increasing and decreasing trend in the series.

# Side-by-side comparison of the original and the Box-Cox transformed series.
library(RColorBrewer)
rb<-brewer.pal(7,"Blues")
par(mfrow=c(1,2))
plot(train, col=rb[7], xlab = "Original Series", ylab="THYAO Price")
# Fix: this panel shows the Box-Cox TRANSFORMED series (differencing happens
# later in the analysis), so label it "Transformed Series", not "Differenced".
plot(train_box, col=rb[4], xlab = "Transformed Series",  ylab="THYAO Price")

Original VS Transformed

# ACF/PACF of the transformed (but not yet differenced) series, side by side.
acf_plot <- ggAcf(train_box, lag.max = 96) +
  theme_minimal() +
  ggtitle("ACF Of Transformed Data")
pacf_plot <- ggPacf(train_box, lag.max = 96) +
  theme_minimal() +
  ggtitle("PACF Of Transformed Data")
grid.arrange(acf_plot, pacf_plot, ncol = 2)

As it is seen that, there is a slow decay in ACF graph, meaning that the series is not stationary. There is no need to check PACF.

Unit - Root Tests

# KPSS tests on the transformed series.
# H0 (Level): the series is level-stationary.
kpss.test(train_box,null="Level")
## Warning in kpss.test(train_box, null = "Level"): p-value smaller than printed
## p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  train_box
## KPSS Level = 1.4779, Truncation lag parameter = 3, p-value = 0.01
# H0 (Trend): the series is trend-stationary.
kpss.test(train_box,null="Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  train_box
## KPSS Trend = 0.15655, Truncation lag parameter = 3, p-value = 0.0412

The p - value of KPSS Test for Level Stationary is 0.01 < 0.05. Therefore, we reject H0 and we can conclude that the series is not stationary. In addition, The p - value of KPSS Test for Trend Stationary is 0.0412 < 0.05. Hence, we can say that the series has stochastic trend. We need to take difference to remove stochastic trend.

# Phillips-Perron test; H0: the series has a unit root (non-stationary).
pp.test(train_box)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  train_box
## Dickey-Fuller Z(alpha) = -3.9635, Truncation lag parameter = 3, p-value
## = 0.8868
## alternative hypothesis: stationary

Since p value is greater than α=0.05 , we fail to reject H0 . It means that we don’t have enough evidence to claim that we have a stationary system.

# Mean of the transformed series: used to choose the ADF deterministic term
# (a non-zero mean suggests including a constant, type = "c").
mean(train_box)
## [1] 1.499884

The mean of the transformed series is not 0 or close to 0. In such cases, we prefer to use the ADF test with a constant term (type = "c").

# ADF test with a constant ("c"), since the series mean is far from zero.
# H0: the series has a unit root (non-stationary).
adfTest(train_box, lags=1, type="c") 
## 
## Title:
##  Augmented Dickey-Fuller Test
## 
## Test Results:
##   PARAMETER:
##     Lag Order: 1
##   STATISTIC:
##     Dickey-Fuller: 0.603
##   P VALUE:
##     0.9888 
## 
## Description:
##  Sun Jan 21 03:18:54 2024 by user: Cesur

Since the p-value is greater than α = 0.05, we fail to reject H0 (the series has a unit root). It means that the series is not stationary.

# ADF test with constant and trend ("ct"), to also allow for a deterministic
# trend. H0: the series has a unit root (non-stationary).
adfTest(train_box, lags=1, type="ct") 
## 
## Title:
##  Augmented Dickey-Fuller Test
## 
## Test Results:
##   PARAMETER:
##     Lag Order: 1
##   STATISTIC:
##     Dickey-Fuller: -1.1308
##   P VALUE:
##     0.9136 
## 
## Description:
##  Sun Jan 21 03:18:54 2024 by user: Cesur

Since p value is greater than α=0.05 , we fail to reject H0. It means that we have non stationary system having stochastic trend.

# Number of regular differences needed to make the series stationary.
ndiffs(train_box)
## [1] 1

It is enough to take 1 difference for the series to make stationary.

diff_train <- diff(train_box)

It seems stationary after taking difference. Let’s check it with unit root tests.

# KPSS on the differenced series; H0: the series is level-stationary.
kpss.test(diff_train,null="Level")
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff_train
## KPSS Level = 0.35046, Truncation lag parameter = 3, p-value = 0.09851

P - value of KPSS test is 0.09851 which is higher than 0.05, so we cannot reject null hypothesis (H0) and we can conclude that the differenced series are stationary.

# ADF without a constant ("nc"): differencing removed the level, so the mean
# of diff_train is near zero. H0: the series has a unit root.
adfTest(diff_train, type="nc")
## Warning in adfTest(diff_train, type = "nc"): p-value smaller than printed
## p-value
## 
## Title:
##  Augmented Dickey-Fuller Test
## 
## Test Results:
##   PARAMETER:
##     Lag Order: 1
##   STATISTIC:
##     Dickey-Fuller: -5.9011
##   P VALUE:
##     0.01 
## 
## Description:
##  Sun Jan 21 03:18:54 2024 by user: Cesur

It is enough to check the ADF test with type "nc" because differencing removed the constant part (the mean of the differenced series is near zero). Since its p-value < 0.05, the series is stationary.

# Phillips-Perron on the differenced series; H0: unit root (non-stationary).
pp.test(diff_train)
## Warning in pp.test(diff_train): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff_train
## Dickey-Fuller Z(alpha) = -100.65, Truncation lag parameter = 3, p-value
## = 0.01
## alternative hypothesis: stationary

Since p value is less than α, we reject H0. We can conclude that the differenced series is stationary.

autoplot(diff_train,main="Time Series Plot of the One - Differenced Data",xlab="THYAO Price",col="blue")

# ACF/PACF of the one-differenced series, displayed side by side.
acf_diff <- ggAcf(diff_train, lag.max = 96) +
  theme_minimal() +
  ggtitle("ACF Of One - Differenced Data")
pacf_diff <- ggPacf(diff_train, lag.max = 96) +
  theme_minimal() +
  ggtitle("PACF Of One - Differenced Data")
grid.arrange(acf_diff, pacf_diff, ncol = 2)

The series becomes stationary after taking one difference. Since there is no seasonality in the data, there is no need to check the HEGY test. One differencing is enough. I will suggest a model according to the results of the ACF-PACF graphs.

We can suggest \(ARIMA(0,1,0)\) called as random walk. We can also suggest \(ARIMA(7,1,7)\) .

# Candidate models suggested by the ACF/PACF of the differenced series.
# fit1: random walk, ARIMA(0,1,0).
fit1 <-Arima(train_box,order = c(0,1,0), seasonal = c(0, 0,0))
fit1
## Series: train_box 
## ARIMA(0,1,0) 
## 
## sigma^2 = 0.001657:  log likelihood = 171.11
## AIC=-340.22   AICc=-340.18   BIC=-337.65
# fit2: richer alternative, ARIMA(7,1,7).
fit2 <-Arima(train_box,order = c(7,1,7), seasonal = c(0, 0,0))
fit2
## Series: train_box 
## ARIMA(7,1,7) 
## 
## Coefficients:
##           ar1     ar2      ar3     ar4     ar5      ar6      ar7     ma1
##       -0.0576  0.3741  -0.0536  0.1110  0.4824  -0.1187  -0.4888  0.0751
## s.e.   0.1424  0.1320   0.1244  0.1194  0.1190   0.1324   0.1324  0.1116
##           ma2      ma3     ma4      ma5     ma6    ma7
##       -0.3469  -0.0249  0.0250  -0.4171  0.1724  0.948
## s.e.   0.1054   0.1017  0.1026   0.1007  0.1101  0.122
## 
## sigma^2 = 0.001361:  log likelihood = 182.52
## AIC=-335.03   AICc=-329.03   BIC=-296.56

Since \(\phi_7\) and \(\theta_7\) are significant, \(ARIMA(7,1,7)\) is appropriate.

# Compare information criteria of the two candidate models.
# Fix: the originals were named a, b, c, d — `c` shadows base::c, which is a
# classic R footgun; use descriptive names instead.
aic1 <- fit1$aic
bic1 <- fit1$bic
aic2 <- fit2$aic
bic2 <- fit2$bic
cat("Model 1 AIC:", aic1, ", Model 1 BIC:", bic1, "\n")
## Model 1 AIC: -340.2178 , Model 1 BIC: -337.6535
cat("Model 2 AIC:", aic2, ", Model 2 BIC:", bic2, "\n")
## Model 2 AIC: -335.0301 , Model 2 BIC: -296.5649

Since AIC and BIC of fit 1 is smaller than fit 2, we choose \(ARIMA(0,1,0)\).

# Automatic model selection as a cross-check; it also selects ARIMA(0,1,0).
auto.arima(train_box)
## Series: train_box 
## ARIMA(0,1,0) 
## 
## sigma^2 = 0.001657:  log likelihood = 171.11
## AIC=-340.22   AICc=-340.18   BIC=-337.65

Also, auto.arima suggests ARIMA (0,1,0) for the series.

Diagnostic Checking

Jarque Bera and Shapiro Test

# Standardized residuals of fit1; Jarque-Bera and Shapiro-Wilk tests,
# both with H0: residuals are normally distributed.
# Fix: use `<-` for assignment (tidyverse style) and call resid() consistently
# instead of mixing resid()/residuals().
r1 <- resid(fit1) / sd(resid(fit1))
jarque.bera.test(r1)
## 
##  Jarque Bera Test
## 
## data:  r1
## X-squared = 0.70911, df = 2, p-value = 0.7015
shapiro.test(r1)
## 
##  Shapiro-Wilk normality test
## 
## data:  r1
## W = 0.99064, p-value = 0.735

Since the p-value is bigger than alpha (0.05), we fail to reject H0. Hence, the normality assumption is satisfied.

Let’s check the QQ - Plot of it.

ggplot(r1, aes(sample = r1)) +stat_qq()+geom_qq_line(col="red")+ggtitle("QQ Plot of the Residuals")+theme_minimal()

Breusch-Godfrey Test

# Breusch-Godfrey test for serial correlation of the residuals up to lag 12.
# H0: the residuals are uncorrelated. An auxiliary AR(1)-style regression
# on the lagged residual (zlag) provides the lm object bgtest() needs.
# Fix: use `<-` for assignment instead of `=`.
m <- lm(r1 ~ 1 + zlag(r1))
bgtest(m,order=12) 
## 
##  Breusch-Godfrey test for serial correlation of order up to 12
## 
## data:  m
## LM test = 14.689, df = 12, p-value = 0.2589

Since p - value is greater than alpha(0.05), we have 95% confident that the residuals of the model are uncorrelated, according to the result of Breusch-Godfrey Test.

ARCH Engle’s Test

# Engle's ARCH LM test on the standardized residuals; H0: no ARCH effects.
ArchTest(r1)
## 
##  ARCH LM-test; Null hypothesis: no ARCH effects
## 
## data:  r1
## Chi-squared = 9.6194, df = 12, p-value = 0.6493

Since the p-value = 0.6493 > 0.05, we fail to reject H0. Hence, we can say that there is no presence of ARCH effects. Therefore, we do not need to use an ARCH or GARCH model.

Forecasting

Back - Transformation

# 12-step-ahead forecast from the ARIMA(0,1,0) fit (still on the Box-Cox scale).
f<-forecast(fit1,h=12)
# NOTE(review): f is on the Box-Cox scale while test is on the price scale,
# so the Test-set row here mixes scales; see the back-transformation below.
accuracy(f,test)
##                       ME         RMSE          MAE        MPE      MAPE
## Training set   0.0059151   0.04049819   0.03240554  0.3116619  2.178828
## Test set     182.7450093 189.58227554 182.74500933 98.8327941 98.832794
##                     MASE       ACF1 Theil's U
## Training set    0.203766 0.01137959        NA
## Test set     1149.100252 0.76983534  6.670223
# Training-set ACF1 (row 1, column 7 of the accuracy matrix).
accuracy(f,test)[1,7]
## [1] 0.01137959

Forecast values are produced with respect to transformed data here. We should apply Back-Transformation.

# Back-transform the forecasts to the original price scale.
# Fix: exp() is the inverse of Box-Cox only when lambda = 0; here
# lambda1 ≈ -0.44, so the correct inverse is InvBoxCox() with that lambda.
back <- InvBoxCox(f$mean, lambda1)
accuracy(back, test)
# NOTE(review): the accuracy printed below was produced by the old exp()
# version; rerun the chunk to refresh the numbers.
##                ME     RMSE      MAE      MPE     MAPE      ACF1 Theil's U
## Test set 177.3773 184.4137 177.3773 95.69492 95.69492 0.7698353  6.468543
# Homoscedasticity check: ACF/PACF of the squared standardized residuals.
std_res <- resid(fit1) / sd(residuals(fit1))
sq_res <- std_res^2
acf_sq <- ggAcf(as.vector(sq_res), lag.max = 72) +
  theme_minimal() +
  ggtitle("ACF of Squared Residuals")
pacf_sq <- ggPacf(as.vector(sq_res), lag.max = 72) +
  theme_minimal() +
  ggtitle("PACF of Squared Residuals")
grid.arrange(acf_sq, pacf_sq, ncol = 2)

Homoscedasticity is checked in the code above. Almost all spikes are within the white noise bands, which is an indication of homoscedasticity.

Modelling

ETS

# Fit an ETS model with automatic component selection ("ZZZ") on the
# original-scale training series; ETS(M,A,N) is selected.
fitnew=ets(train,model="ZZZ")
fitnew
## ETS(M,A,N) 
## 
## Call:
##  ets(y = train, model = "ZZZ") 
## 
##   Smoothing parameters:
##     alpha = 0.7781 
##     beta  = 0.1759 
## 
##   Initial states:
##     l = 9.6632 
##     b = -0.3211 
## 
##   sigma:  0.1365
## 
##      AIC     AICc      BIC 
## 550.3300 550.9893 563.2035
# 12-month ETS forecast plotted with the fitted values overlaid.
Fr1=forecast(fitnew,h=12)
autoplot(Fr1) + autolayer(fitted(Fr1),series="fitted")+theme_minimal()

Normality Checking for ETS

# Standardized ETS residuals; Jarque-Bera test, H0: normally distributed.
# Fix: use `<-` for assignment and call resid() consistently.
r <- resid(fitnew) / sd(resid(fitnew))
jarque.bera.test(r)
## 
##  Jarque Bera Test
## 
## data:  r
## X-squared = 1.5811, df = 2, p-value = 0.4536

Since the p-value is bigger than alpha (0.05), we fail to reject H0. Hence, the normality assumption is satisfied.

Accuracy Test

# Training- and test-set accuracy of the ETS forecasts (original price scale).
accuracy(Fr1,test)
##                       ME      RMSE       MAE       MPE     MAPE      MASE
## Training set   0.7749416  4.027892  1.984933   1.54562 10.01209 0.2166595
## Test set     -18.2301631 30.339217 23.286348 -12.63560 15.24538 2.5417533
##                    ACF1 Theil's U
## Training set 0.08253673        NA
## Test set     0.50447244  1.421913

TBATS

# Fit a TBATS model to the training series; with no seasonality detected it
# reduces to BATS(0, {0,0}, 1, -) with a Box-Cox lambda of ~0.
tbatsmodel<-tbats(train)
tbatsmodel
## BATS(0, {0,0}, 1, -)
## 
## Call: tbats(y = train)
## 
## Parameters
##   Lambda: 0
##   Alpha: 0.801937
##   Beta: 0.152849
##   Damping Parameter: 1
## 
## Seed States:
##            [,1]
## [1,] 2.36459508
## [2,] 0.05025062
## attr(,"lambda")
## [1] 7.52242e-09
## 
## Sigma: 0.1270689
## AIC: 545.1156
# Fitted values over the training data, then the 12-month forecast.
autoplot(train,main="TS plot of Train with TBATS Fitted") + autolayer(fitted(tbatsmodel), series="Fitted") +theme_minimal()

tbats_forecast<-forecast(tbatsmodel,h=12)
autoplot(tbats_forecast)+theme_minimal()

Normality Checking for TBATS

# Standardized TBATS residuals; Jarque-Bera test, H0: normally distributed.
# Fix: use `<-` for assignment and call resid() consistently.
rr <- resid(tbatsmodel) / sd(resid(tbatsmodel))
jarque.bera.test(rr)
## 
##  Jarque Bera Test
## 
## data:  rr
## X-squared = 0.1476, df = 2, p-value = 0.9289

Since the p-value is bigger than alpha (0.05), we fail to reject H0. Hence, the normality assumption is satisfied.

Accuracy Test

# Training- and test-set accuracy of the TBATS forecasts.
accuracy(tbats_forecast,test)
##                       ME       RMSE        MAE           MPE      MAPE
## Training set    0.351754   3.558368   1.941747   -0.02715223  10.26801
## Test set     -231.990450 299.177908 231.990450 -112.82190062 112.82190
##                    MASE       ACF1 Theil's U
## Training set  0.2119458 -0.1697709        NA
## Test set     25.3222404  0.6908177  9.343687

NNETAR

# Neural network autoregression; P=0 disables seasonal lags (no seasonality).
# The automatic order selection gives NNAR(1,1): one lag, one hidden node.
nnet_for <- nnetar(train,P=0)
nnet_for
## Series: train 
## Model:  NNAR(1,1) 
## Call:   nnetar(y = train, P = 0)
## 
## Average of 20 networks, each of which is
## a 1-1-1 network with 4 weights
## options were - linear output units 
## 
## sigma^2 estimated as 12.59
# 12-month NNETAR forecast with fitted values overlaid.
fcast <- forecast(nnet_for,h=12)
autoplot(fcast) + autolayer(fitted(fcast),series="fitted")+theme_minimal()
## Warning: Removed 1 row containing missing values (`geom_line()`).

Normality Checking for NNETAR

# Shapiro-Wilk test on the NNETAR residuals; H0: normally distributed.
shapiro.test(residuals(nnet_for))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(nnet_for)
## W = 0.75404, p-value = 2.21e-11

Since p - value is smaller than alpha (0.05) , we reject H0. Hence, Normality Assumption is not satisfied.

Accuracy Test

# Training- and test-set accuracy of the NNETAR forecasts.
accuracy(fcast,test)
##                        ME      RMSE       MAE       MPE     MAPE     MASE
## Training set 4.140114e-05  3.547983  1.921649 -1.391268 10.32818 0.209752
## Test set     2.635436e+01 54.588206 45.558700  7.849195 22.72422 4.972827
##                    ACF1 Theil's U
## Training set -0.4650233        NA
## Test set      0.7700396  1.766325

Comparing Train

# Comparison table of training-set accuracy (ME..MAPE plus ACF1) for the four
# models.
# Fix: the original cbind orders disagreed — rows ME..MAPE were built in the
# order ETS, NNETAR, TBATS, ARIMA while the ACF1 row used ETS, TBATS, NNETAR,
# ARIMA, so the TBATS and NNETAR values ended up swapped under the final
# column labels. Build both pieces in one consistent order.
etsac <- accuracy(Fr1,test)[1, 1:5]
tbatsac <- accuracy(tbats_forecast,test)[1, 1:5]
nnetarac <- accuracy(fcast,test)[1, 1:5]
arimaac <- accuracy(f,test)[1, 1:5]

etsac <- as.data.frame(etsac)
tbatsac <- as.data.frame(tbatsac)
nnetarac <- as.data.frame(nnetarac)
arimaac <- as.data.frame(arimaac)

# Training-set ACF1 (column 7 of the forecast-object accuracy matrix).
ets7 <- accuracy(Fr1, test)[1,7]
tbats7 <- accuracy(tbats_forecast,test)[1,7]
nnetar7 <- accuracy(fcast,test)[1,7]
arima7 <- accuracy(f,test)[1,7]

ets7 <- as.data.frame(ets7)
tbats7 <- as.data.frame(tbats7)
nnetar7 <- as.data.frame(nnetar7)
arima7 <- as.data.frame(arima7)

acf_train <- cbind(etsac,tbatsac,nnetarac,arimaac)
acf7 <- cbind(ets7,tbats7,nnetar7,arima7)
colnames(acf7) <- c("etsac","tbatsac","nnetarac","arimaac")
accuracycompare <- rbind(acf_train, acf7)

colnames(accuracycompare) <- c("ETS","TBATS","NNETAR","ARIMA")
rownames(accuracycompare) <- c('ME', 'RMSE', 'MAE', 'MPE', 'MAPE', 'ACF1')
# NOTE(review): ARIMA's training metrics are on the Box-Cox scale (fit1 was
# fit to train_box), so they are not directly comparable with the other models.
round(accuracycompare,2)
# NOTE(review): the table below was produced by the old, mislabeled code;
# rerun to refresh — the TBATS and NNETAR columns will swap in rows ME..MAPE.
##        ETS TBATS NNETAR ARIMA
## ME    0.77  0.00   0.35  0.01
## RMSE  4.03  3.55   3.56  0.04
## MAE   1.98  1.92   1.94  0.03
## MPE   1.55 -1.39  -0.03  0.31
## MAPE 10.01 10.33  10.27  2.18
## ACF1  0.08 -0.17  -0.47  0.01

According to the training-set accuracy values, ARIMA appears to be the best model with the lowest values; note, however, that the ARIMA metrics are computed on the Box-Cox-transformed scale, so they are not directly comparable with the other models.

Comparing Forecast

# Comparison table of test-set (forecast) accuracy for the four models.
etstr <- accuracy(Fr1,test)[2, 1:5]
tbatstr <- accuracy(tbats_forecast,test)[2, 1:5]
nnetartr <- accuracy(fcast, test)[2, 1:5]
# Fix: compare ARIMA on the price scale by scoring the back-transformed
# forecasts (`back`) against test, instead of the Box-Cox-scale object f.
arimatr <- accuracy(back,test)[1, 1:5]

etstr <- as.data.frame(etstr)
tbatstr <- as.data.frame(tbatstr)
nnetartr <- as.data.frame(nnetartr)
arimatr <- as.data.frame(arimatr)

etstr7 <- accuracy(Fr1,test)[2, 7]
tbatstr7 <- accuracy(tbats_forecast,test)[2, 7]
nnetartr7 <- accuracy(fcast, test)[2, 7]
# Fix: accuracy(back, test) returns a single row WITHOUT the MASE column, so
# position 7 is Theil's U, not ACF1 (which is why the old table reported 6.47
# as ARIMA's ACF1). Select the ACF1 column by name instead.
arimatr7 <- accuracy(back,test)[1, "ACF1"]

etstr7<- as.data.frame(etstr7)
tbatstr7 <- as.data.frame(tbatstr7)
nnetartr7 <- as.data.frame(nnetartr7)
arimatr7 <- as.data.frame(arimatr7)

acf_test <- cbind(etstr,tbatstr,nnetartr,arimatr)
acf7_test <- cbind(etstr7,tbatstr7,nnetartr7,arimatr7)
colnames(acf7_test) <- c("etstr","tbatstr","nnetartr","arimatr")
accuracytestcompare <- rbind(acf_test, acf7_test)

colnames(accuracytestcompare) <- c("ETS","TBATS","NNETAR","ARIMA")
rownames(accuracytestcompare) <- c('ME', 'RMSE', 'MAE', 'MPE', 'MAPE', 'ACF1')
round(accuracytestcompare,2)
# NOTE(review): the table below predates the fixes above; rerun to refresh
# (e.g. ARIMA's ACF1 becomes 0.77 rather than the Theil's U value 6.47).
##         ETS   TBATS NNETAR  ARIMA
## ME   -18.23 -231.99  26.35 182.75
## RMSE  30.34  299.18  54.59 189.58
## MAE   23.29  231.99  45.56 182.75
## MPE  -12.64 -112.82   7.85  98.83
## MAPE  15.25  112.82  22.72  98.83
## ACF1   0.50    0.69   0.77   6.47

According to the accuracy of the forecast values, the best model is ETS, with the lowest RMSE and MAPE.

# Overlay the data, the ETS fitted values, and the 12-month forecast; mark
# the train/test boundary (January 2023) with a vertical line.
series_colors <- c("green", "red", "blue")
autoplot(Fr1) +
  autolayer(Fr1$mean, series = "Forecast", size = 1) +
  autolayer(fitted(fitnew), series = "Fitted", size = 1) +
  autolayer(datap, series ="Data", size = 1) +
  ylab("") +
  scale_color_manual(values = series_colors) +
  geom_vline(xintercept = 2023)  # 2023 + (1 - 1) / 12 = 2023.0